In [ ]:
import os, sys, psutil, difflib
from collections import OrderedDict
import numpy as np
import scipy as sp
import pandas as pd
import gseapy as gp
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from bionlp.util import fs, io, func, njobs
from bionlp import txtclf
LABEL2IDX = {'gene perturbation':2, 'drug perturbation':1, 'disease signature':0}
LABEL2OBJ = {'gene perturbation':'hs_gene_symbol', 'drug perturbation':'drug_name', 'disease signature':'disease_name'}
RUN_LABEL = 'drug perturbation'
_RUN_LABEL = RUN_LABEL.replace(' ', '_')
DGE_METHOD = 'limma-fdr'
DATA_PATH = '../../data/gesgnext'
GE_PATH = '../../data/gesgnext/gedata/%s' % _RUN_LABEL
DGE_PATH = '../../data/gesgnext/dge/%s/%s' % (DGE_METHOD, _RUN_LABEL)
DGE_DATA_PATH = '../../data/gesgnext/dge/%s' % _RUN_LABEL
# DGE_CACHE_PATH = '../../data/gesgnext/dge/cache/%s/%s' % (_RUN_LABEL, DGE_METHOD)
GEO_PATH = '../../data/gesgnext/geo'
GSE_DIR = '../../data/gesgnext/geo/xml/%s' % _RUN_LABEL
SAMP_DIR = '../../data/gesgnext/geo/xml/%s/samples' % _RUN_LABEL
PLATFORM_PATH = '../../data/gesgnext/geo/xml/%s/platforms' % _RUN_LABEL
SGNDB_PATH = '../../data/gesgnext/sgndb/%s' % _RUN_LABEL
WIKIPATHWAYS_PATH = '../../data/gesgnext/wikipathways'
# probe_gene_map = io.read_obj(os.path.join(PLATFORM_PATH, 'probe_gene_map.pkl'))
probe_gene_map = None
SGN_MIN_SIZE, SGN_MAX_SIZE = 5, 100
SC=' /// '
In [ ]:
# Signatures
# sgn_df = pd.read_csv(os.path.join(DATA_PATH, '%s.csv'%RUN_LABEL.replace(' ', '_')))
sgn_df = pd.read_excel(os.path.join(DATA_PATH, '%s.xlsx'%RUN_LABEL.replace(' ', '_')))
# Differential gene expression
# dge_dfs = [io.read_df(os.path.join(DGE_PATH, fname), with_idx=True) for fname in ['dge_%s.npz'%x for x in range(sgn_df.shape[0])]]
dge_dfs = [io.read_df(os.path.join(DGE_PATH, fname), with_idx=True) for fname in ['dge_%s.npz'%sgn_id for sgn_id in sgn_df['id']]]
# dge_dfs = [io.read_df(os.path.join(DGE_PATH, 'dge_%s.npz'%sgn_id.split(':')[-1]), with_idx=True) for sgn_id in sgn_df['id']]
# dge_dfs = [io.read_df(os.path.join(DGE_CACHE_PATH, '%s.npz'%sgn_id)) for sgn_id in sgn_df['id']]
for geo_id, sgn_ids in sgn_df.groupby('geo_id').groups.iteritems():
# Training data for classifier
sub_sgn_df = sgn_df.loc[sgn_ids]
sub_dge_dfs = [dge_dfs[i] for i in sgn_ids]
dge_X = pd.concat([dge_df['statistic'].to_frame() for dge_df in sub_dge_dfs], axis=1, join='inner')
# dge_X = pd.concat([dge_df['t'].to_frame() for dge_df in sub_dge_dfs], axis=1, join='inner')
dge_X.columns = sub_sgn_df['id']
dge_X = dge_X.transpose()
io.write_df(dge_X, os.path.join(DGE_DATA_PATH, 'dge_X_%s.npz'%geo_id), with_idx=True, compress=True)
# Label Construction
mlb = MultiLabelBinarizer()
bin_label = (mlb.fit_transform(sub_sgn_df[LABEL2OBJ[RUN_LABEL]].apply(str).as_matrix().reshape(-1,1)), mlb.classes_)
io.write_df(pd.DataFrame(bin_label[0], index=dge_X.index, columns=bin_label[1]), os.path.join(DGE_DATA_PATH, 'dge_Y_%s.npz'%geo_id), with_idx=True, sparse_fmt='csr', compress=True)
le = LabelEncoder()
encoded_lb = (le.fit_transform(sub_sgn_df[LABEL2OBJ[RUN_LABEL]].apply(str).as_matrix()), le.classes_)
io.write_df(pd.DataFrame(encoded_lb[0], index=dge_X.index, columns=[';'.join(['%i:%s'%(i,x) for i, x in enumerate(encoded_lb[1])])]), os.path.join(DGE_DATA_PATH, 'dge_ecY_%s.npz'%geo_id), with_idx=True, compress=True)
del dge_X, bin_label, encoded_lb
In [ ]:
def sgn2dgeg(groups, sgn_df, dge_dir, dgeg_dir):
for geo_id, sgn_ids in groups:
# Training data for classifier
sub_sgn_df = sgn_df.loc[sgn_ids]
# sub_dge_dfs = [dge_dfs[i] for i in sgn_ids]
sub_dge_dfs = [io.read_df(os.path.join(dge_dir, fname), with_idx=True) for fname in ['dge_%s.npz'%sgn_id for sgn_id in sub_sgn_df['id']]]
dge_X = pd.concat([dge_df['statistic'].to_frame() for dge_df in sub_dge_dfs], axis=1, join='inner')
dge_X.columns = sub_sgn_df['id']
dge_X = dge_X.transpose()
io.write_df(dge_X, os.path.join(dgeg_dir, 'dge_X_%s.npz'%geo_id), with_idx=True, compress=True)
# Label Construction
mlb = MultiLabelBinarizer()
bin_label = (mlb.fit_transform(sub_sgn_df[LABEL2OBJ[RUN_LABEL]].apply(str).as_matrix().reshape(-1,1)), mlb.classes_)
io.write_df(pd.DataFrame(bin_label[0], index=dge_X.index, columns=bin_label[1]), os.path.join(dgeg_dir, 'dge_Y_%s.npz'%geo_id), with_idx=True, sparse_fmt='csr', compress=True)
le = LabelEncoder()
encoded_lb = (le.fit_transform(sub_sgn_df[LABEL2OBJ[RUN_LABEL]].apply(str).as_matrix()), le.classes_)
io.write_df(pd.DataFrame(encoded_lb[0], index=dge_X.index, columns=[';'.join(['%i:%s'%(i,x) for i, x in enumerate(encoded_lb[1])])]), os.path.join(DGE_DATA_PATH, 'dge_ecY_%s.npz'%geo_id), with_idx=True, compress=True)
del dge_X, bin_label, encoded_lb
sgn_df = pd.read_excel(os.path.join(DATA_PATH, '%s.xlsx'%RUN_LABEL.replace(' ', '_')))
groups = sgn_df.groupby('geo_id').groups.items()
numprocs = psutil.cpu_count()
task_bnd = njobs.split_1d(len(groups), split_num=numprocs, ret_idx=True)
_ = njobs.run_pool(sgn2dgeg, n_jobs=numprocs, dist_param=['groups'], groups=[groups[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], sgn_df=sgn_df, dge_dir=DGE_PATH, dgeg_dir=DGE_DATA_PATH)
In [ ]:
sgn_df = pd.read_excel(os.path.join(DATA_PATH, '%s.xlsx' % _RUN_LABEL))
idx_sgn_df = sgn_df.set_index('id')
probe_gene_map = io.read_obj(os.path.join(PLATFORM_PATH, 'probe_gene_map.pkl'))
keep_unkown_probe, hist_bnd = False, (-2, 1)
udr_genes = []
for dge_X_fpath in fs.listf(DGE_DATA_PATH, pattern='dge_X_.*\.npz', full_path=True):
dge_X = io.read_df(dge_X_fpath, with_idx=True).replace([np.inf, -np.inf], np.nan).fillna(0)
if (np.all(pd.isnull(dge_X.as_matrix()))): continue
# Filter out the probes that cannot be converted to gene symbols
plfm = idx_sgn_df['platform'].loc[dge_X.index[0]]
has_plfm = probe_gene_map and probe_gene_map.has_key(plfm) and not probe_gene_map[plfm].empty
if (has_plfm and not keep_unkown_probe):
pgmap = probe_gene_map[plfm]
columns = [col for col in dge_X.columns if col in pgmap.index and pgmap.loc[col] and not pgmap.loc[col].isspace()]
dge_X = dge_X[columns]
hist, bin_edges = zip(*[np.histogram(dge_X.iloc[i]) for i in range(dge_X.shape[0])])
uprg = [dge_X.iloc[i, np.where(dge_X.iloc[i] >= bin_edges[i][hist_bnd[0]])[0]].sort_values(ascending=False) for i in range(dge_X.shape[0])]
dwrg = [dge_X.iloc[i, np.where(dge_X.iloc[i] <= bin_edges[i][hist_bnd[1]])[0]].sort_values(ascending=True) for i in range(dge_X.shape[0])]
upr_genes, dwr_genes = [x.index.tolist() for x in uprg], [x.index.tolist() for x in dwrg]
upr_dges, dwr_dges = [x.tolist() for x in uprg], [x.tolist() for x in dwrg]
del uprg, dwrg
# Map to Gene Symbol
if (has_plfm):
pgmap = probe_gene_map[plfm]
upr_genes = [[[x.strip() for x in pgmap.loc[probe].split('///')] if (probe in pgmap.index) else [probe] for probe in probes] for probes in upr_genes]
uprg_lens = [[len(x) for x in genes] for genes in upr_genes]
upr_dges = [[[dge] * length for dge, length in zip(dges, lens)] for dges, lens in zip(upr_dges, uprg_lens)]
upr_genes = [func.flatten_list(probes) for probes in upr_genes]
upr_dges = [func.flatten_list(dges) for dges in upr_dges]
dwr_genes = [[[x.strip() for x in pgmap.loc[probe].split('///')] if (probe in pgmap.index) else [probe] for probe in probes] for probes in dwr_genes]
dwrg_lens = [[len(x) for x in genes] for genes in dwr_genes]
dwr_dges = [[[dge] * length for dge, length in zip(dges, lens)] for dges, lens in zip(dwr_dges, dwrg_lens)]
dwr_genes = [func.flatten_list(probes) for probes in dwr_genes]
dwr_dges = [func.flatten_list(dges) for dges in dwr_dges]
udr_genes.append(pd.DataFrame(OrderedDict([('up_regulated_genes', ['|'.join(map(str, x[:SGN_MAX_SIZE])) for x in upr_genes]), ('down_regulated_genes', ['|'.join(map(str, x[-SGN_MAX_SIZE:])) for x in dwr_genes]), ('up_regulated_dges', ['|'.join(map(str, x[:SGN_MAX_SIZE])) for x in upr_dges]), ('down_regulated_dges', ['|'.join(map(str, x[-SGN_MAX_SIZE:])) for x in dwr_dges])]), index=dge_X.index))
del upr_genes, dwr_genes, upr_dges, dwr_dges
if (has_plfm): del uprg_lens, dwrg_lens
new_sgn_df = pd.concat([idx_sgn_df, pd.concat(udr_genes, axis=0, join='inner')], axis=1, join_axes=[idx_sgn_df.index])
new_sgn_fpath = os.path.join(DATA_PATH, '%s_udrg.xlsx' % _RUN_LABEL)
io.write_df(new_sgn_df, new_sgn_fpath, with_idx=True)
new_sgn_df.to_excel(new_sgn_fpath, encoding='utf8')
In [ ]:
def dge2udrg(sgn_dge_fpaths, sgn_df, probe_gene_map, keep_unkown_probe=False, hist_bnd=(-2, 1)):
udr_genes = []
for sgn_dge_fpath in sgn_dge_fpaths:
sgn_dge = io.read_df(sgn_dge_fpath, with_idx=True).replace([np.inf, -np.inf], np.nan).fillna(0)
sgn_dge = sgn_dge.loc[[x for x in sgn_dge.index if x in sgn_df.index]]
if (np.all(pd.isnull(sgn_dge))): continue
# Filter out the probes that cannot be converted to gene symbols
plfm = sgn_df['platform'].loc[sgn_dge.index[0]]
has_plfm = probe_gene_map and probe_gene_map.has_key(plfm) and not probe_gene_map[plfm].empty
if (has_plfm and not keep_unkown_probe):
pgmap = probe_gene_map[plfm]
columns = [col for col in sgn_dge.columns if col in pgmap.index and pgmap.loc[col] and not pgmap.loc[col].isspace()]
sgn_dge = sgn_dge[columns]
hist, bin_edges = zip(*[np.histogram(sgn_dge.iloc[i]) for i in range(sgn_dge.shape[0])])
uprg = [sgn_dge.iloc[i, np.where(sgn_dge.iloc[i] >= bin_edges[i][hist_bnd[0]])[0]].sort_values(ascending=False) for i in range(sgn_dge.shape[0])]
dwrg = [sgn_dge.iloc[i, np.where(sgn_dge.iloc[i] <= bin_edges[i][hist_bnd[1]])[0]].sort_values(ascending=True) for i in range(sgn_dge.shape[0])]
upr_genes, dwr_genes = [x.index.tolist() for x in uprg], [x.index.tolist() for x in dwrg]
upr_dges, dwr_dges = [x.tolist() for x in uprg], [x.tolist() for x in dwrg]
del uprg, dwrg
# Map to Gene Symbol
if (has_plfm):
pgmap = probe_gene_map[plfm]
upr_genes = [[[x.strip() for x in pgmap.loc[probe].split('///')] if (probe in pgmap.index) else [probe] for probe in probes] for probes in upr_genes]
uprg_lens = [[len(x) for x in genes] for genes in upr_genes]
upr_dges = [[[dge] * length for dge, length in zip(dges, lens)] for dges, lens in zip(upr_dges, uprg_lens)]
upr_genes = [func.flatten_list(probes) for probes in upr_genes]
upr_dges = [func.flatten_list(dges) for dges in upr_dges]
dwr_genes = [[[x.strip() for x in pgmap.loc[probe].split('///')] if (probe in pgmap.index) else [probe] for probe in probes] for probes in dwr_genes]
dwrg_lens = [[len(x) for x in genes] for genes in dwr_genes]
dwr_dges = [[[dge] * length for dge, length in zip(dges, lens)] for dges, lens in zip(dwr_dges, dwrg_lens)]
dwr_genes = [func.flatten_list(probes) for probes in dwr_genes]
dwr_dges = [func.flatten_list(dges) for dges in dwr_dges]
filtered_ids = []
for sid, uprg, dwrg in zip(sgn_dge.index, upr_genes, dwr_genes):
if (len(uprg) < SGN_MIN_SIZE and len(dwrg) < SGN_MIN_SIZE):
filtered_ids.append(sid)
udr_genes.append(pd.DataFrame(OrderedDict([('up_regulated_genes', ['|'.join(map(str, x[:SGN_MAX_SIZE])) for x in upr_genes]), ('down_regulated_genes', ['|'.join(map(str, x[-SGN_MAX_SIZE:])) for x in dwr_genes]), ('up_regulated_dges', ['|'.join(map(str, x[:SGN_MAX_SIZE])) for x in upr_dges]), ('down_regulated_dges', ['|'.join(map(str, x[-SGN_MAX_SIZE:])) for x in dwr_dges])]), index=sgn_dge.index).loc[[sid for sid in sgn_dge.index if sid not in filtered_ids]])
del upr_genes, dwr_genes, upr_dges, dwr_dges
if (has_plfm): del uprg_lens, dwrg_lens
return pd.concat(udr_genes, axis=0, join='inner')
sgn_df = pd.read_excel(os.path.join(DATA_PATH, '%s.xlsx' % _RUN_LABEL))
idx_sgn_df = sgn_df.set_index('id')
keep_unkown_probe, hist_bnd = False, (-4, 3)
numprocs = psutil.cpu_count()
sgn_dge_fpaths = fs.listf(DGE_DATA_PATH, pattern='dge_X_.*\.npz', full_path=True)
task_bnd = njobs.split_1d(len(sgn_dge_fpaths), split_num=numprocs, ret_idx=True)
# udr_genes = dge2udrg(sgn_dge_fpaths=sgn_dge_fpaths, sgn_df=idx_sgn_df, probe_gene_map=probe_gene_map, keep_unkown_probe=keep_unkown_probe, hist_bnd=hist_bnd)
udr_genes = njobs.run_pool(dge2udrg, n_jobs=numprocs, dist_param=['sgn_dge_fpaths'], sgn_dge_fpaths=[sgn_dge_fpaths[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], sgn_df=idx_sgn_df, probe_gene_map=probe_gene_map, keep_unkown_probe=keep_unkown_probe, hist_bnd=hist_bnd)
new_sgn_df = pd.concat([idx_sgn_df, pd.concat(udr_genes, axis=0, join='inner')], axis=1, join_axes=[idx_sgn_df.index])
new_sgn_fpath = os.path.join(DATA_PATH, '%s_udrg.xlsx' % _RUN_LABEL)
io.write_df(new_sgn_df, new_sgn_fpath, with_idx=True)
new_sgn_df.to_excel(new_sgn_fpath, encoding='utf8')
In [ ]:
def gen_sgndb(groups, udrg_sgn_df, sgndb_path):
for geo_id, sgn_ids in groups:
sub_sgn_df = udrg_sgn_df.loc[sgn_ids]
# Combined signature database
# db_content = '\n'.join(['%s\t%s\t%s' % (idx, ('%s:%s:%s'%(row['organism'], row['cell_type'], row[LABEL2OBJ[RUN_LABEL]])).replace(' ', '_'), '\t'.join(row['up_regulated_genes'].split('|')+row['down_regulated_genes'].split('|'))) for idx, row in sub_sgn_df.iterrows()])
# fs.write_file(db_content, os.path.join(sgndb_path, '%s.gmt'%geo_id), code='utf-8')
# del db_content
# Up-regulated signature database
up_db_content = '\n'.join(['%s\t%s\t%s' % (idx, ('%s:%s:%s'%(row['organism'], row['cell_type'], row[LABEL2OBJ[RUN_LABEL]])).replace(' ', '_'), '\t'.join(row['up_regulated_genes'].split('|'))) for idx, row in sub_sgn_df.iterrows()])
fs.write_file(up_db_content, os.path.join(sgndb_path, '%s_up.gmt'%geo_id), code='utf-8')
del up_db_content
# Down-regulated signature database
down_db_content = '\n'.join(['%s\t%s\t%s' % (idx, ('%s:%s:%s'%(row['organism'], row['cell_type'], row[LABEL2OBJ[RUN_LABEL]])).replace(' ', '_'), '\t'.join(row['down_regulated_genes'].split('|'))) for idx, row in sub_sgn_df.iterrows()])
fs.write_file(down_db_content, os.path.join(sgndb_path, '%s_down.gmt'%geo_id), code='utf-8')
del down_db_content
# print [len(row['up_regulated_genes'].split('|')) for idx, row in sub_sgn_df.iterrows()]
# print [len(row['down_regulated_genes'].split('|')) for idx, row in sub_sgn_df.iterrows()]
fs.mkdir(SGNDB_PATH)
udrg_sgn_df = io.read_df(os.path.join(DATA_PATH, '%s_udrg.xlsx'%RUN_LABEL.replace(' ', '_')), with_idx=True).dropna()
groups = udrg_sgn_df.groupby('geo_id').groups.items()
numprocs = psutil.cpu_count()
task_bnd = njobs.split_1d(len(groups), split_num=numprocs, ret_idx=True)
_ = njobs.run_pool(gen_sgndb, n_jobs=numprocs, dist_param=['groups'], groups=[groups[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], udrg_sgn_df=udrg_sgn_df, sgndb_path=SGNDB_PATH)
In [ ]:
def gsea(groups, udrg_sgn_df, probe_gene_map, sgndb_path, sample_path, method='signal_to_noise', permt_type='phenotype', permt_num=100, min_size=15, max_size=500, out_dir='gsea_output', keep_unkown_probe=False, fmt='xml', numprocs=1):
if (fmt == 'soft'):
from bionlp.spider import geo
else:
from bionlp.spider import geoxml as geo
for geo_id, sgn_ids in groups:
# Select the sub signature table
sub_sgn_df = udrg_sgn_df.loc[sgn_ids]
ids = sub_sgn_df['id'] if hasattr(sub_sgn_df, 'id') else sub_sgn_df.index
# Prepair the gene expression profile and the perturbation labels
pert_ids, ctrl_ids = list(set('|'.join(sub_sgn_df['pert_ids']).split('|'))), list(set('|'.join(sub_sgn_df['ctrl_ids']).split('|')))
pert_geo_docs, ctrl_geo_docs = geo.parse_geos([os.path.join(sample_path, '.'.join([pert_id, fmt])) for pert_id in pert_ids], view='full', type='gsm', fmt=fmt), geo.parse_geos([os.path.join(sample_path, '.'.join([ctrl_id, fmt])) for ctrl_id in ctrl_ids], view='full', type='gsm', fmt=fmt)
pert_ge_dfs, ctrl_ge_dfs = [geo_doc['data']['VALUE'] for geo_doc in pert_geo_docs], [geo_doc['data']['VALUE'] for geo_doc in ctrl_geo_docs]
pert_df, ctrl_df = pd.concat(pert_ge_dfs, axis=1, join='inner').astype('float32'), pd.concat(ctrl_ge_dfs, axis=1, join='inner').astype('float32')
pert_lb, ctrl_lb, class_vec = 'pert', 'ctrl', ['pert'] * pert_df.shape[1] + ['ctrl'] * ctrl_df.shape[1]
join_df = pd.concat([pert_df, ctrl_df], axis=1, join='inner')
join_df.columns = pert_ids + ctrl_ids
del pert_geo_docs, ctrl_geo_docs, pert_ge_dfs[:], ctrl_ge_dfs[:], pert_df, ctrl_df
# Map the probes to gene symbols
plfm = sub_sgn_df['platform'].iloc[0]
if (probe_gene_map and probe_gene_map.has_key(plfm) and not probe_gene_map[plfm].empty):
pgmap = probe_gene_map[plfm]
if (not keep_unkown_probe):
probes = [idx for idx in join_df.index if idx in pgmap.index and pgmap.loc[idx] and not pgmap.loc[idx].isspace()]
join_df = join_df.loc[probes]
join_df.index = [[x.strip() for x in pgmap.loc[probe].split('///')][0] if (probe in pgmap.index) else [probe] for probe in join_df.index]
join_df.reset_index(inplace=True)
join_df.rename(columns={'ID_REF': 'NAME'}, inplace=True)
join_df['NAME'] = join_df['NAME'].apply(str)
# Call the GSEA API
# try:
# if (not os.path.exists(os.path.join(out_dir,geo_id)) or (os.path.exists(os.path.join(out_dir,geo_id)) and len(fs.read_file(os.path.join(sgndb_path, '%s.gmt'%geo_id))) > len(fs.listf(os.path.join(out_dir,geo_id), pattern='.*\.gsea\.pdf')))):
# print 'doing '+geo_id
# gs_res = gp.gsea(data=join_df, gene_sets=os.path.join(sgndb_path, '%s.gmt'%geo_id), cls=class_vec, permutation_type=permt_type, permutation_num=permt_num, min_size=min_size, max_size=max_size, outdir=os.path.join(out_dir,geo_id), method=method, processes=numprocs, format='pdf')
# except Exception as e:
# print 'Error occured when conducting GSEA for up-regulated genes in %s!' % geo_id
# print e
try:
if (not os.path.exists(os.path.join(out_dir,geo_id+'up')) or (os.path.exists(os.path.join(out_dir,geo_id+'up')) and len(fs.read_file(os.path.join(sgndb_path, '%s_up.gmt'%geo_id))) > len(fs.listf(os.path.join(out_dir,geo_id+'up'), pattern='.*\.gsea\.pdf')))):
print 'doing '+geo_id+'_up'
gs_res = gp.gsea(data=join_df, gene_sets=os.path.join(sgndb_path, '%s_up.gmt'%geo_id), cls=class_vec, permutation_type=permt_type, permutation_num=permt_num, min_size=min_size, max_size=max_size, outdir=os.path.join(out_dir,geo_id+'up'), method=method, processes=numprocs, format='pdf')
except Exception as e:
print 'Error occured when conducting GSEA for up-regulated genes in %s!' % geo_id
print e
try:
if (not os.path.exists(os.path.join(out_dir,geo_id+'down')) or (os.path.exists(os.path.join(out_dir,geo_id+'down')) and len(fs.read_file(os.path.join(sgndb_path, '%s_down.gmt'%geo_id))) > len(fs.listf(os.path.join(out_dir,geo_id+'down'), pattern='.*\.gsea\.pdf')))):
print 'doing '+geo_id+'_down'
gs_res = gp.gsea(data=join_df, gene_sets=os.path.join(sgndb_path, '%s_down.gmt'%geo_id), cls=class_vec, permutation_type=permt_type, permutation_num=permt_num, min_size=min_size, max_size=max_size, outdir=os.path.join(out_dir,geo_id+'down'), method=method, processes=numprocs, format='pdf')
except Exception as e:
print 'Error occured when conducting GSEA for down-regulated genes in %s!' % geo_id
print e
del join_df
udrg_sgn_df = pd.read_excel(os.path.join(DATA_PATH, '%s_udrg.xlsx'%RUN_LABEL.replace(' ', '_')), index_col='id').dropna()
# udrg_sgn_df = udrg_sgn_df[udrg_sgn_df['geo_id'] == 'GSE10809']
method, permt_type, permt_num, keep_unkown_probe = 'signal_to_noise', 'phenotype', 100, False
out_dir = os.path.join('gsea', method, _RUN_LABEL)
# probe_gene_map = io.read_obj('probe_gene_map.pkl')
numprocs = psutil.cpu_count()
groups = udrg_sgn_df.groupby('geo_id').groups.items()
task_bnd = njobs.split_1d(len(groups), split_num=numprocs, ret_idx=True)
gsea(groups, udrg_sgn_df=udrg_sgn_df, probe_gene_map=probe_gene_map, sgndb_path=SGNDB_PATH, sample_path=SAMP_DIR, method=method, permt_type=permt_type, permt_num=permt_num, min_size=SGN_MIN_SIZE, max_size=SGN_MAX_SIZE, out_dir=out_dir, keep_unkown_probe=keep_unkown_probe, numprocs=numprocs)
# _ = njobs.run_pool(gsea, n_jobs=numprocs, dist_param=['groups'], groups=[groups[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], udrg_sgn_df=udrg_sgn_df, probe_gene_map=probe_gene_map, sgndb_path=SGNDB_PATH, sample_path=SAMP_DIR, method=method, permt_type=permt_type, permt_num=permt_num, min_size=SGN_MIN_SIZE, max_size=SGN_MAX_SIZE, out_dir=out_dir, keep_unkown_probe=keep_unkown_probe, numprocs=1)
In [ ]:
udrg_sgn_df = pd.read_excel(os.path.join(DATA_PATH, '%s_udrg.xlsx'%RUN_LABEL.replace(' ', '_')), index_col='id')
method = 'signal_to_noise'
out_dir = os.path.join('gsea', method, _RUN_LABEL)
up_reports, down_reports = [], []
for geo_id, sgn_ids in udrg_sgn_df.groupby('geo_id').groups.items():
uprep_fpath, downrep_fpath = os.path.join(out_dir, geo_id+'up', 'gseapy.gsea.phenotype.report.csv'), os.path.join(out_dir, geo_id+'down', 'gseapy.gsea.phenotype.report.csv')
if (os.path.exists(uprep_fpath)):
up_reports.append(pd.read_csv(uprep_fpath).set_index('Term')[['es','pval','fdr']].rename(columns={'es':'up_es','pval':'up_pval','fdr':'up_fdr'}))
if (os.path.exists(downrep_fpath)):
down_reports.append(pd.read_csv(downrep_fpath).set_index('Term')[['es','pval','fdr']].rename(columns={'es':'down_es','pval':'down_pval','fdr':'down_fdr'}))
up_gsea_report = pd.concat(up_reports, axis=0)
down_gsea_report = pd.concat(down_reports, axis=0)
gsea_sgn_df = pd.concat([udrg_sgn_df, up_gsea_report, down_gsea_report], axis=1, join_axes=[udrg_sgn_df.index])
io.write_df(gsea_sgn_df, '%s_udrg_gsea'%RUN_LABEL.replace(' ', '_'), with_idx=True)
gsea_sgn_df.to_excel('%s_udrg_gsea.xlsx'%RUN_LABEL.replace(' ', '_'), encoding='utf8')
In [ ]:
wkpw_species, wkpw_annots, wkpw_genes = [[] for x in range(3)]
for fpath in fs.listf(WIKIPATHWAYS_PATH, pattern='.*\.gmt', full_path=True):
lines = [l.strip('\n').split('\t') for l in fs.read_file(fpath)]
annots, genes = zip(*[(l[:2], l[2:]) for l in lines])
annots, genes = list(annots), list(genes)
wkpw_species.append(os.path.splitext(os.path.basename(fpath))[0].lower().replace(' ', '_')), wkpw_annots.append(list(annots)), wkpw_genes.append(list(genes))
In [ ]:
from bionlp.spider import geoxml as geo
from cStringIO import StringIO
pred_species, pred_gses, pred_gpls, pred_annots, pred_genes = [[] for x in range(5)]
for fpath in fs.listf(SGNDB_PATH, pattern='.*\.gmt', full_path=True):
lines = [l.strip('\n').split('\t') for l in fs.read_file(fpath)]
annots, genes = zip(*[(l[:2], l[2:]) for l in lines])
annots, genes = list(annots), list(genes)
species = annots[0][1].split(':')[0].lower().replace(' ', '_')
gse_id = os.path.splitext(os.path.basename(fpath))[0].split('_')[0].lower().replace(' ', '_')
gse_doc = geo.parse_geo(os.path.join(GSE_DIR, '%s.xml'%gse_id.upper()), type='gse')
pred_species.append(species), pred_gses.append(gse_id), pred_gpls.append(geo.parse_geo(os.path.join(SAMP_DIR, '%s.xml'%gse_doc['samples'][0]), type='gsm')['platform']), pred_annots.append(list(annots)), pred_genes.append(list(genes))
In [ ]:
def gs_ol(species, gses, gpls, genes, ref_species, ref_genes):
try:
pgmap = io.read_obj(os.path.join(PLATFORM_PATH, 'probe_gene_map.pkl'))
except Exception as e:
pgmap = None
for species, gse, gpl, gss in zip(species, gses, gpls, genes):
has_pgmap = pgmap is not None and pgmap.has_key(gpl)
try:
spcs_idx = ref_species.index(species)
except ValueError as e:
print e
continue
for ref_gs in ref_genes[spcs_idx]:
for gs in gss:
if (len(gs) == 0 or not gs[0]): continue
if (has_pgmap):
gs = func.flatten_list(map(lambda x: pgmap[gpl].loc[x].split(SC) if x and x in pgmap[gpl].index else x, gs))
gs = [x if x.strip() != '///' else 0 for x in gs]
gs = [x for x in gs if float(x) != 0]
gs_sim = difflib.SequenceMatcher(None, gs, ref_gs).ratio()
if (gs_sim > 0.2): print 'Found %f%% similar gene set with size %i in series %s' % (gs_sim, len(gs), gse)
numprocs = psutil.cpu_count()
task_bnd = njobs.split_1d(len(pred_gses), split_num=numprocs, ret_idx=True)
# gs_ol(pred_species, pred_gses, pred_gpls, pred_genes, ref_species=wkpw_species, ref_genes=wkpw_genes)
_ = njobs.run_pool(gs_ol, n_jobs=numprocs, dist_param=['species', 'gses', 'gpls', 'genes'], species=[pred_species[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], gses=[pred_gses[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], gpls=[pred_gpls[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], genes=[pred_genes[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], ref_species=wkpw_species, ref_genes=wkpw_genes)
In [ ]:
GSE_ID = 'GSE48301'
dge_X = io.read_df(os.path.join(DGE_DATA_PATH, 'dge_X_%s.npz'%GSE_ID), with_idx=True)
dge_Y = io.read_df(os.path.join(DGE_DATA_PATH, 'dge_Y_%s.npz'%GSE_ID), with_idx=True, sparse_fmt='csr')
In [ ]:
sub_dge_dfs=[io.read_df(os.path.join(DGE_PATH, fname), with_idx=True) for fname in ['dge_%s.npz'%x for x in range(71, 107)]]
In [ ]:
set(sub_dge_dfs[0].index) & set(sub_dge_dfs[5].index)
In [ ]:
def gen_mdls(tuned=False, glb_clfnames=[], **kwargs):
clf_names = []
for clf_name, clf in [
('RandomForest', Pipeline([('clf', func.build_model(RandomForestClassifier, 'Classifier', 'Random Forest', mltl=True, mltp=True, n_jobs=1, random_state=0))]))
]:
yield clf_name, clf
clf_names.append(clf_name)
if (len(glb_clfnames) < len(clf_names)):
del glb_clfnames[:]
glb_clfnames.extend(clf_names)
txtclf.cross_validate(dge_X, dge_Y, gen_mdls, model_param=dict(tuned=False, glb_filtnames=[], glb_clfnames=[]), avg='micro', kfold=3, global_param=dict(comb=True, pl_names=[], pl_set=set([])), lbid=-1)